{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from mpl_toolkits import mplot3d\n",
    "import seaborn as sns\n",
    "import numpy as np\n",
    "\n",
    "import scipy.cluster.hierarchy as shc\n",
    "\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "from sklearn.cluster import AgglomerativeClustering\n",
    "from sklearn.cluster import KMeans\n",
    "\n",
    "from sklearn.metrics import confusion_matrix\n",
    "from sklearn.metrics import silhouette_score\n",
    "from sklearn.metrics import silhouette_samples\n",
    "\n",
    "from sklearn.decomposition import PCA\n",
    "\n",
    "from sklearn import datasets\n",
    "\n",
    "%matplotlib inline\n",
    "pd.set_option(\"display.max_columns\", None)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Lab 23 - Silhouette score revisited and Principal Components Analysis"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load the iris dataset, as in previous labs."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the iris data set and wrap its measurements in a DataFrame\n",
    "# whose columns carry the feature names.\n",
    "iris_dict = datasets.load_iris()\n",
    "iris = pd.DataFrame(data=iris_dict.data, columns=iris_dict.feature_names)\n",
    "iris.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Scale the columns of the iris dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Rescale every feature column to the [0, 1] interval.\n",
    "scaler = MinMaxScaler(feature_range=(0, 1))\n",
    "scaler.fit(iris)\n",
    "iris_scaled = scaler.transform(iris)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Below is code to compute the silhouette coefficient for each data point, with the number of clusters ranging from 2 to 6."
] },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for k in range(2, 7):\n",
    "    # One figure per candidate k, holding a single silhouette panel.\n",
    "    fig, ax1 = plt.subplots(1, 1)\n",
    "    fig.set_size_inches(12, 7)\n",
    "\n",
    "    # Silhouette coefficients lie in [-1, 1]; show the full range so that\n",
    "    # negative (poorly assigned) samples stay visible.\n",
    "    ax1.set_xlim([-1, 1])\n",
    "    # The (k + 1) * 10 extra rows leave blank space between the silhouette\n",
    "    # bands of individual clusters, to demarcate them clearly.\n",
    "    ax1.set_ylim([0, len(iris_scaled) + (k + 1) * 10])\n",
    "\n",
    "    # Fix the seed (and the number of restarts) so that the cluster labels\n",
    "    # and scores are the same on every run.\n",
    "    clusterer = KMeans(n_clusters=k, n_init=10, random_state=10)\n",
    "    cluster_labels = clusterer.fit_predict(iris_scaled)\n",
    "\n",
    "    # The silhouette_score gives the average value over all the samples,\n",
    "    # which summarises the density and separation of the formed clusters.\n",
    "    silhouette_avg = silhouette_score(iris_scaled, cluster_labels)\n",
    "    print(\"For n_clusters =\", k,\n",
    "          \"The average silhouette_score is :\", silhouette_avg)\n",
    "\n",
    "    # Compute the silhouette coefficient of each individual sample\n",
    "    sample_silhouette_values = silhouette_samples(iris_scaled, cluster_labels)\n",
    "\n",
    "    y_lower = 10\n",
    "    for i in range(k):\n",
    "        # Collect the silhouette coefficients of the samples assigned to\n",
    "        # cluster i, sorted in ascending order\n",
    "        ith_cluster_silhouette_values = np.sort(\n",
    "            sample_silhouette_values[cluster_labels == i])\n",
    "\n",
    "        size_cluster_i = ith_cluster_silhouette_values.shape[0]\n",
    "        y_upper = y_lower + size_cluster_i\n",
    "\n",
    "        # The colormap is reached through plt.cm, so no separate\n",
    "        # `cm` import is required.\n",
    "        color = plt.cm.nipy_spectral(float(i) / k)\n",
    "        ax1.fill_betweenx(np.arange(y_lower, y_upper),\n",
    "                          0, ith_cluster_silhouette_values,\n",
    "                          facecolor=color, edgecolor=color, alpha=0.7)\n",
    "\n",
    "        # Label each silhouette band with its cluster number at mid-height\n",
    "        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))\n",
    "\n",
    "        # Start the next band 10 rows higher to leave a visible gap\n",
    "        y_lower = y_upper + 10\n",
    "\n",
    "    ax1.set_title(\"The silhouette plot for the various clusters.\")\n",
    "    ax1.set_xlabel(\"The silhouette coefficient values\")\n",
    "    ax1.set_ylabel(\"Cluster label\")\n",
    "\n",
    "    # The vertical line marks the average silhouette score of all samples\n",
    "    ax1.axvline(x=silhouette_avg, color=\"red\", linestyle=\"--\")\n",
    "\n",
    "    ax1.set_yticks([])  # Clear the yaxis labels / ticks\n",
    "    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])\n",
    "\n",
    "    plt.suptitle((\"Silhouette analysis for KMeans clustering on sample data \"\n",
    "                  \"with n_clusters = %d\" % k),\n",
    "                 fontsize=14, fontweight='bold')\n",
    "\n",
    "    # Render now so that each figure appears right after its printed score\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Principal Components Analysis\n",
    "\n",
    "Principal Components Analysis or PCA reduces the dimensions of the data set by successively finding the directions with the most variation and using these directions as the new coordinate system.\n",
    "\n",
    "The following code finds the first two principal components for the iris dataset:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Fit PCA on the unscaled iris measurements, then project the data onto\n",
    "# the first two principal components.\n",
    "pca = PCA(n_components=2)\n",
    "pca.fit(iris)\n",
    "iris_rotated = pca.transform(iris)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's create a new dataframe with the principal components and the species names."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "iris_rotated_df = pd.DataFrame(iris_rotated, columns=[\"PC1\", \"PC2\"])\n",
    "# Map the integer class codes in `target` to the species names, so the\n",
    "# column really holds names rather than 0/1/2.\n",
    "iris_rotated_df[\"Species\"] = iris_dict.target_names[iris_dict.target]\n",
    "iris_rotated_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Display this dataframe."
] },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Use Seaborn to plot a scatter plot where x is PC1 and y is PC2, with the points colored by the species."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's compare this scatter plot with all possible scatter plots from the original data. \n",
    "\n",
    "First create a new dataframe with the original iris data and the species names."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now use pairplot() to plot the scatter plots. hue will also work as a parameter here."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load the labor dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# skipfooter is not supported by the default C parser, so the Python engine\n",
    "# is requested explicitly instead of relying on the automatic fallback,\n",
    "# which emits a ParserWarning.\n",
    "labor = pd.read_csv(\n",
    "    \"../data/Nov2019_labor_market_majors.csv\",\n",
    "    skiprows=13,\n",
    "    skipfooter=3,\n",
    "    index_col=\"Major\",\n",
    "    engine=\"python\",\n",
    ")\n",
    "\n",
    "# The wage columns are read as strings containing commas; strip the commas\n",
    "# and convert the values to float.\n",
    "for wage_column in [\"Median Wage Early Career\", \"Median Wage Mid-Career\"]:\n",
    "    labor[wage_column] = labor[wage_column].str.replace(\",\", \"\").astype(float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "labor.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Transform the labor data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Put the data into a dataframe."
] },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Re-wrap the scaled values in a DataFrame that carries labor's column\n",
    "# names and row index.\n",
    "labor_scaled = pd.DataFrame(\n",
    "    data=labor_scaled,\n",
    "    index=labor.index,\n",
    "    columns=labor.columns,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Find the first two principal components using the unscaled labor data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Use k-means to cluster the scaled labor data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create a new dataframe with the principal components and the cluster labels."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Plot the colored-by-cluster data using the principal component coordinates."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can even find the first three principal components and plot them in 3D."
] },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Fit a three-component PCA on the labor data and project it onto those\n",
    "# components.\n",
    "pca2 = PCA(n_components=3)\n",
    "pca2.fit(labor)\n",
    "labor2_rotated = pca2.transform(labor)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "labor2_rotated_df = pd.DataFrame(labor2_rotated, columns=[\"PC1\", \"PC2\", \"PC3\"])\n",
    "# NOTE: kmeans_clusters is not defined in any provided cell; it is presumably\n",
    "# the label array produced in the k-means exercise cell above - confirm.\n",
    "labor2_rotated_df[\"Cluster\"] = kmeans_clusters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure()\n",
    "ax = fig.add_subplot(111, projection=\"3d\")\n",
    "\n",
    "# Colour each point by its cluster label so the grouping is visible in 3D.\n",
    "ax.scatter3D(\n",
    "    labor2_rotated_df[\"PC1\"],\n",
    "    labor2_rotated_df[\"PC2\"],\n",
    "    labor2_rotated_df[\"PC3\"],\n",
    "    c=labor2_rotated_df[\"Cluster\"],\n",
    ")\n",
    "\n",
    "ax.set_xlabel(\"PC1\")\n",
    "ax.set_ylabel(\"PC2\")\n",
    "ax.set_zlabel(\"PC3\")\n",
    "ax.set_title(\"Labor data: first three principal components, colored by cluster\");"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}